Note: Dataset donated by Ron Kohavi and Barry Becker, from the article "Scaling Up the Accuracy of Naive-Bayes Classifiers: A Decision-Tree Hybrid". Small changes to the dataset have been made, such as removing the
'fnlwgt' feature and records with missing or ill-formatted entries.
import numpy as np # Package for numerical computing with Python
import pandas as pd # Package to work with data in tabular form and the like
from scipy.stats import skew
from time import time # Package to work with time values
from IPython.display import display # Allows the use of display() for DataFrames
import matplotlib.pyplot as plt # Package for plotting
import seaborn as sns # Package for plotting, prettier than matplotlib
import visuals as vs # Adapted from Udacity
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
# iPython Notebook formatting
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Account for changes made to imported packages
%load_ext autoreload
%autoreload 2
# Load the pre-cleaned census dataset (see header note: 'fnlwgt' and malformed rows already removed).
data = pd.read_csv("census.csv")
Featureset Exploration
# Quick structural overview of the dataset: dtypes, missing values, summary stats.
data.info(show_counts=True)  # 'null_counts' was deprecated in pandas 1.2 and removed in 2.0
pd.DataFrame(data.isna().sum()).T  # confirm there are no missing values
data.describe(include='all').T  # summarize each column; transposed (personal preference)

n_records = data.shape[0]  # first element of .shape is the row count
# Boolean masks are clearer and cheaper than .where(...).dropna(), which would
# also silently drop rows containing NaN in any *other* column.
n_greater_50k = (data['income'] == '>50K').sum()   # n of those with income > 50k
n_at_most_50k = (data['income'] == '<=50K').sum()  # n of those with income <= 50k
greater_percent = round((n_greater_50k / n_records) * 100, 2)  # proportion of > 50k, in percent

data_details = {"Number of observations": n_records,
                "Number of people with income > 50k": n_greater_50k,
                "Number of people with income <= 50k": n_at_most_50k,
                "Percent of people with income > 50k": greater_percent}  # cache values of analysis
for label, value in data_details.items():  # iterate through the cache
    print("{0}: {1}".format(label, value))  # print the values
Before this data can be used for modeling and application to machine learning algorithms, it must be cleaned, formatted, and structured.
Split the data into features and labels
# Separate the prediction target (income) from the predictor columns.
income_raw = data['income']
features_raw = data.drop(columns='income')
The features capital-gain and capital-loss are positively skewed (i.e. have a long tail in the positive direction).
Overly skewed data can influence the outcomes of statistical models. Very large or very small values can negatively affect a model's performance.
To reduce this skew, a logarithmic transformation, $\tilde x = \ln\left(x\right)$, can be applied. This transformation will reduce the amount of variance and pull the mean closer to the center of the distribution.
# Plot the raw capital-loss / capital-gain histograms, stacked vertically
# (row 1 = loss, row 2 = gain) to show how heavily skewed both features are.
fig = make_subplots(rows=2, cols=1)

for subplot_row, (column, label) in enumerate(
        [('capital-loss', 'Capital-Loss'), ('capital-gain', 'Capital-Gain')],
        start=1):
    fig.add_trace(
        go.Histogram(x=data[column], nbinsx=25, name=label),
        row=subplot_row, col=1
    )
    fig.update_xaxes(title_text="{} Feature Distribution".format(label),
                     row=subplot_row, col=1)
    # Clip the y-range so the dominant zero bin does not swamp the rest.
    fig.update_yaxes(title_text="Number of Records", range=[0, 2000],
                     row=subplot_row, col=1)

fig.update_layout(height=800, width=1000,
                  title_text="Skewed Distributions of Continuous Census Data Features",
                  showlegend=False,
                  # Relabel the top tick of the first y-axis so the clipped bin reads ">2000".
                  yaxis=dict(
                      tickmode='array',
                      tickvals=[0, 500, 1000, 1500, 2000],
                      ticktext=[0, 500, 1000, 1500, ">2000"]
                  ))
fig.show()
# Quantify how heavy-tailed each raw feature is before any transformation.
loss_series = data['capital-loss']
gain_series = data['capital-gain']

cap_loss_skew = skew(loss_series)
cap_loss_var = np.var(loss_series)
cap_loss_mean = np.mean(loss_series)

cap_gain_skew = skew(gain_series)
cap_gain_var = np.var(gain_series)
cap_gain_mean = np.mean(gain_series)

# Tabulate for side-by-side comparison with the post-transform statistics later.
fac_df = pd.DataFrame({
    'Feature': ['Capital Loss', 'Capital Gain'],
    'Skewness': [cap_loss_skew, cap_gain_skew],
    'Mean': [cap_loss_mean, cap_gain_mean],
    'Variance': [cap_loss_var, cap_gain_var],
})
display(fac_df)
# Apply log(x + 1) to the two heavily skewed features; the +1 keeps the
# many zero entries defined (log(0) would be -inf).
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data=features_raw)
for column in skewed:
    features_log_transformed[column] = np.log(features_raw[column] + 1)
# Plot the same two features after the log transform (row 1 = loss, row 2 = gain).
fig = make_subplots(rows=2, cols=1)
# Fixed title: the previous title was copy-pasted from the raw-feature figure,
# but these histograms show the log-transformed (de-skewed) features.
fig.update_layout(height=800, width=1000,
                  title_text="Log-Transformed Distributions of Continuous Census Data Features",
                  showlegend=False
                  )

# Shared y-axis styling (was duplicated verbatim for each row):
# relabel the top tick so the clipped bin reads ">2000".
y_tick_style = dict(
    tickmode='array',
    tickvals=[0, 500, 1000, 1500, 2000],
    ticktext=[0, 500, 1000, 1500, ">2000"]
)

for subplot_row, (column, label) in enumerate(
        [('capital-loss', 'Capital-Loss'), ('capital-gain', 'Capital-Gain')],
        start=1):
    fig.add_trace(
        go.Histogram(x=features_log_transformed[column], nbinsx=25,
                     name='Log of {}'.format(label)),
        row=subplot_row, col=1
    )
    fig.update_xaxes(title_text="Log of {} Feature Distribution".format(label),
                     row=subplot_row, col=1)
    fig.update_yaxes(title_text="Number of Records", range=[0, 2000],
                     patch=dict(y_tick_style),
                     row=subplot_row, col=1)

fig.show()
# Post-transform statistics, for comparison against the raw-feature table.
log_loss = features_log_transformed['capital-loss']
log_gain = features_log_transformed['capital-gain']

log_cap_loss_skew = skew(log_loss)
log_cap_loss_var = round(np.var(log_loss), 5)
log_cap_loss_mean = np.mean(log_loss)
log_cap_gain_skew = skew(log_gain)
log_cap_gain_var = round(np.var(log_gain), 5)  # redundant float() wrapper removed
log_cap_gain_mean = np.mean(log_gain)

log_fac_df = pd.DataFrame({'Feature': ['Log Capital Loss', 'Log Capital Gain'],
                           'Skewness': [log_cap_loss_skew, log_cap_gain_skew],
                           'Mean': [log_cap_loss_mean, log_cap_gain_mean],
                           'Variance': [log_cap_loss_var, log_cap_gain_var]})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement and produces the same result here.
fac_df = pd.concat([fac_df, log_fac_df], ignore_index=True)
# Format Variance with fixed 5-decimal notation so large values do not
# render in scientific notation.
fac_df['Variance'] = fac_df['Variance'].apply(lambda x: '%.5f' % x)
display(fac_df)
| Feature | Skewness | Mean | Variance |
|---|---|---|---|
| Capital Loss | 4.516154 | 88.595418 | 163985.81018 |
| Capital Gain | 11.788611 | 1101.430344 | 56345246.60482 |
| Log Capital Loss | 4.271053 | 0.355489 | 2.54688 |
| Log Capital Gain | 3.082284 | 0.740759 | 6.08362 |
# Export the notebook to reveal.js HTML in three variants via nbconvert.
# # Full Page - Code: scrolling reveal.js page with code cells visible
!jupyter nbconvert Polishing_Donor_Classification.ipynb --output WIP_Class_Code --reveal-prefix=reveal.js --SlidesExporter.reveal_theme=serif --SlidesExporter.reveal_scroll=True --SlidesExporter.reveal_transition=none
# # Full Page - No Code: same page with input cells excluded
!jupyter nbconvert Polishing_Donor_Classification.ipynb --output WIP_Class_No_Code --reveal-prefix=reveal.js --SlidesExporter.reveal_theme=serif --SlidesExporter.reveal_scroll=True --SlidesExporter.reveal_transition=none --TemplateExporter.exclude_input=True
# # Slides - No Code: slide deck with input cells excluded
!jupyter nbconvert --to slides Polishing_Donor_Classification.ipynb --output WIP_Class_No_Code_Slides --TemplateExporter.exclude_input=True --SlidesExporter.reveal_transition=none